# load package(s) first
library(dplyr)
R has a built-in sample data frame, “mtcars”.
# Print the mtcars data frame to inspect its columns and values.
mtcars
ggplot2 package

ggplot2 is a system for declaratively creating graphics, based on The Grammar of Graphics. You provide the data, tell ggplot2 how to map variables to aesthetics, what graphical primitives to use, and it takes care of the details. (https://ggplot2.tidyverse.org)
# install.packages("ggplot2") # install only once
library(ggplot2) # load every session
# Basic ggplot2 template. `dataset`, `xcol` and `ycol` are placeholders, not
# real objects: substitute your own data frame and column names before running.
# A geom that uses both x and y (geom_point()) is shown here, because
# geom_histogram() bins a single variable and rejects a mapping that supplies
# both x and y.
ggplot(data = dataset, mapping = aes(x = xcol, y = ycol)) +
  geom_point()
aes() specifies which columns of the data table are used as visual attributes of the graphical elements in the plot.

| Aesthetic | Meaning |
|-----------|---------|
| colour | Colour of the outline |
| fill | Colour of the interior |
| linetype | Line type |
| shape | Shape of point |
| alpha | Transparency |
| Geom | Plot type |
|------|-----------|
| geom_point() | Scatter plot |
| geom_bar() | Bar chart |
| geom_line() | Line plot |
| geom_histogram() | Histogram |
| geom_boxplot() | Box plot |
Inside aes(): mappings to variables (columns) of the data frame.
Outside aes(): fixed options that do not come from the data frame.
# Gallery of basic geoms drawn from mtcars; each heading names the geom used.

# geom_point(): scatter plot of hp against mpg
ggplot(mtcars, aes(x = mpg, y = hp)) +
  geom_point()

# geom_bar(): bar chart of the number of rows for each cyl value
ggplot(mtcars, aes(x = cyl)) +
  geom_bar()

# geom_line(): line plot of wt against mpg
ggplot(mtcars, aes(x = mpg, y = wt)) +
  geom_line()

# geom_histogram(): histogram of wt
ggplot(mtcars, aes(x = wt)) +
  geom_histogram()

# geom_boxplot(): box plot of wt for each cyl value
# Use factor() to treat cyl as a discrete (categorical) variable.
ggplot(mtcars, aes(x = factor(cyl), y = wt)) +
  geom_boxplot()
# Horsepower against miles per gallon. Points are coloured by number of gears
# (factor() makes the colour scale discrete) and a single least-squares line is
# fitted through all cars.
# Colour is mapped only inside geom_point(): a plot-level colour mapping would
# be overridden there anyway and would also be inherited by geom_smooth(),
# which cannot use a colour that varies within its single group.
ggplot(mtcars, aes(x = mpg, y = hp)) +
  geom_point(aes(colour = factor(gear))) +
  # formula is spelled out (y ~ x is what "lm" uses by default) so that
  # ggplot2 does not emit its informational message about the chosen formula
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(title = "Miles per Gallon -vs- Horsepower")
# storms data frame: preview the first rows.
head(storms)
How many records are there in each year?
# Bar chart of the number of rows per year, with the mapping set on the plot.
ggplot(data = storms, mapping = aes(x = year)) +
  geom_bar()
# this works as well: the same mapping set on the layer instead
ggplot(data = storms) +
  geom_bar(mapping = aes(x = year))
Then, how many storms are there in each year?
This needs some data manipulation first: reduce the data to one row per storm name within each year.
# One row per distinct (year, name) pair.
# The result is left grouped by year on purpose: count() on it then tallies
# the rows of each year.
storms_year_name <- storms %>%
  select(year, name) %>%
  group_by(year) %>%
  distinct()
storms_year_name
# Bar chart of the number of distinct names per year.
ggplot(data = storms_year_name) +
  geom_bar(mapping = aes(x = year))
# check
count(storms_year_name)
# Keep only the observations from 1975, then draw a histogram of wind speed.
storms75 <- storms %>%
  filter(year == 1975)
ggplot(data = storms75, mapping = aes(x = wind)) +
  geom_histogram()
Change the bin width and compare.
# Same histogram of 1975 wind speed with bins 5 units wide ...
ggplot(data = storms75, mapping = aes(x = wind)) +
  geom_histogram(binwidth = 5)
# ... and with bins 10 units wide, for comparison.
ggplot(data = storms75, mapping = aes(x = wind)) +
  geom_histogram(binwidth = 10)
There are three storms in 1975: Amy, Caroline, and Doris.
# Distinct values of the name column among the 1975 observations.
storms75 %>%
  pull(name) %>%
  unique()
[1] "Amy" "Caroline" "Doris"
Compare the wind speeds of the three.
# Box plot of wind speed, one box per storm name.
ggplot(data = storms75, mapping = aes(x = name, y = wind)) +
  geom_boxplot()
# Density curve of wind speed over all 1975 observations together.
ggplot(data = storms75, mapping = aes(x = wind)) +
  geom_density()
What does the distribution look like?
# Overlaid wind-speed density curves, one per storm name. Outline and fill are
# both mapped to name; alpha = 0.5 keeps overlapping curves visible.
ggplot(
  data = storms75,
  mapping = aes(x = wind, colour = name, fill = name)
) +
  geom_density(alpha = 0.5)
To draw each group in its own panel, use facet_wrap(). Here we facet by name.
# Wind-speed density per storm name, each name drawn in its own panel.
# Outline and fill are both mapped to name; alpha = 0.5 makes the fill
# semi-transparent.
ggplot(
  data = storms75,
  mapping = aes(x = wind, colour = name, fill = name)
) +
  geom_density(alpha = 0.5) +
  facet_wrap(~ name)
# Observations of storm Amy among the 1975 data, and a preview of them.
amy75 <- filter(storms75, name == "Amy")
head(amy75)
# Wind speed against the row index of each observation.
# seq_len(nrow()) replaces 1:nrow(), which would produce c(1, 0) and fail to
# line up with the data if amy75 had no rows.
ggplot(data = amy75, aes(x = seq_len(nrow(amy75)), y = wind)) +
  geom_point() +
  xlab("time (6 hours each)")
For a chronological graph, a line plot is commonly used.
# Wind speed against the row index of each observation, drawn as points
# joined by a line.
# seq_len(nrow()) replaces 1:nrow(), which would produce c(1, 0) and fail to
# line up with the data if amy75 had no rows.
ggplot(data = amy75, aes(x = seq_len(nrow(amy75)), y = wind)) +
  geom_point() +
  geom_line() +
  xlab("time")
Color by status.
# Wind speed against the row index of each observation; the line is drawn
# first and the points, coloured by status, are drawn on top of it.
# seq_len(nrow()) replaces 1:nrow(), which would produce c(1, 0) and fail to
# line up with the data if amy75 had no rows.
ggplot(amy75, aes(x = seq_len(nrow(amy75)), y = wind)) +
  geom_line() +
  geom_point(aes(color = status)) +
  # explicit label so the axis does not show the index expression itself
  xlab("time")
What about pressure?
# Pressure against the row index of each observation: points coloured by
# status, joined by a dashed line.
# seq_len(nrow()) replaces 1:nrow(), which would produce c(1, 0) and fail to
# line up with the data if amy75 had no rows.
ggplot(amy75, aes(x = seq_len(nrow(amy75)), y = pressure)) +
  geom_point(aes(color = status)) +
  geom_line(linetype = "dashed") +
  # explicit label so the axis does not show the index expression itself
  xlab("time")
Graph the pressure, with the wind speed reflected in the size of the points and line segments.
# Pressure against the row index of each observation. Wind speed sets the
# size of both the gray line segments and the points; points are additionally
# coloured by status.
# seq_len(nrow()) replaces 1:nrow(), which would produce c(1, 0) and fail to
# line up with the data if amy75 had no rows.
# NOTE(review): recent ggplot2 releases deprecate `size` for lines in favour
# of `linewidth`; `size` is kept here so the code also runs on older
# releases -- confirm against the installed ggplot2 version.
ggplot(amy75, aes(x = seq_len(nrow(amy75)), y = pressure)) +
  geom_line(aes(size = wind), color = "gray") +
  geom_point(aes(color = status, size = wind)) +
  # explicit label so the axis does not show the index expression itself
  xlab("time")
# Pressure against wind speed for every row of storms, coloured by category.
ggplot(
  data = storms,
  mapping = aes(x = wind, y = pressure, colour = category)
) +
  geom_point()
# The same scatter plot split into one panel per month.
ggplot(
  data = storms,
  mapping = aes(x = wind, y = pressure, colour = category)
) +
  geom_point() +
  facet_wrap(~ month)
# As above, with semi-transparent points (alpha = 0.2) so that overlapping
# observations remain distinguishable.
ggplot(
  data = storms,
  mapping = aes(x = wind, y = pressure, colour = category)
) +
  geom_point(alpha = 0.2) +
  facet_wrap(~ month)
# Keep only the observations from 1980.
storms80 <- storms %>%
  filter(year == 1980)
# Pressure per storm name as box plots ...
ggplot(data = storms80, mapping = aes(x = name, y = pressure)) +
  geom_boxplot()
# ... and as violin plots.
ggplot(data = storms80, mapping = aes(x = name, y = pressure)) +
  geom_violin()
# ts_diameter against wind speed with a least-squares line on top.
# na.rm = TRUE drops rows with missing values without a warning.
# NOTE(review): newer dplyr releases appear to ship storms with this column
# renamed to tropicalstorm_force_diameter -- confirm against the installed
# dplyr version.
ggplot(data = storms, mapping = aes(x = wind, y = ts_diameter)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "lm", na.rm = TRUE)
Try geom_smooth() with method = "lm" to fit a least squares regression line.
Try geom_smooth() with method = "loess" to fit a local polynomial regression.
# ts_diameter against wind speed with a loess smoother on top.
# na.rm = TRUE drops rows with missing values without a warning.
# NOTE(review): newer dplyr releases appear to ship storms with this column
# renamed to tropicalstorm_force_diameter -- confirm against the installed
# dplyr version.
ggplot(data = storms, mapping = aes(x = wind, y = ts_diameter)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "loess", na.rm = TRUE)
# Same axes with semi-transparent points coloured by status. The colour
# mapping lives on the point layer only, so one least-squares line is fitted
# to all observations rather than one per status.
ggplot(data = storms, mapping = aes(x = wind, y = ts_diameter)) +
  geom_point(mapping = aes(color = status), alpha = 0.5, na.rm = TRUE) +
  geom_smooth(method = "lm", na.rm = TRUE)
# one variable
# Quantile-quantile plot of pressure.
ggplot(data = storms) +
  geom_qq(mapping = aes(sample = pressure))
# Bar chart of the number of rows for each status.
ggplot(data = storms) +
  geom_bar(mapping = aes(x = status))
# two variables
# count() on storms_year_name supplies the per-year tally in column n, which
# is drawn as a line over the years.
ggplot(data = count(storms_year_name), mapping = aes(x = year, y = n)) +
  geom_line()
# Contour lines of the 2D density of pressure against wind speed.
ggplot(data = storms, mapping = aes(x = wind, y = pressure)) +
  geom_density2d()
# three variables
# Tiles positioned by wind speed and pressure, filled by status.
ggplot(data = storms, mapping = aes(x = wind, y = pressure)) +
  geom_tile(mapping = aes(fill = status))